# Notebook setup: widen the Jupyter container, import modelling libraries,
# and load the shared path constants (path_SBA) from the project utils.
from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))
from IPython.display import IFrame
import pandas as pd
import numpy as np
# from plotly.offline import init_notebook_mode, iplot
# import cufflinks as cf
# init_notebook_mode()
# cf.go_offline()
# NOTE(review): a __future__ import must be the first statement of a plain
# Python module; this only works here because each notebook cell compiles
# separately. Enables true division for the rate computations below.
from __future__ import division
import xgboost as xgb
import sys
sys.path.insert(0,'../')
from utils.paths import *
print path_SBA
# IPython shell magic (Jupyter only): list the study files on S3.
!aws s3 ls --human-readable s3://eh-home/ehda-calvin/SBA_study/
# Preprocessing
import preprocessing as pp
reload(pp)
# SBA national loan data (semicolon-separated CSV); path_SBA comes from utils.paths.
nat = pd.read_csv(path_SBA + 'SBAnational_new.csv', sep = ';', low_memory=False)
# Add job related features
nat['Expanding'] = nat.CreateJob.apply(pp.expanding)
# BUG FIX: the 'Retaining' flag is derived from RetainedJob, not CreateJob
# (mirrors the RetainedJob usage in the ratio below).
nat['Retaining'] = nat.RetainedJob.apply(pp.retaining)
# BUG FIX: dropped the trailing .value_counts() — it assigned a frequency table
# (indexed by ratio value) instead of the per-row ratio, producing a mostly-NaN
# column through index misalignment.
nat['Expanding_ratio'] = nat.apply(lambda x: pp.expanding_ratio(x['CreateJob'], x['NoEmp']), axis= 1)
# BUG FIX: retaining_ratio lives in the preprocessing module — the bare name
# was a NameError; same .value_counts() removal as above.
nat['Retaining_ratio'] = nat.apply(lambda x: pp.retaining_ratio(x['RetainedJob'], x['NoEmp']), axis= 1)
# Keep only the columns used downstream: identifiers, loan attributes,
# outcome fields, and the engineered features added above.
use_col = ['LoanNr_ChkDgt', 'Name', 'City', 'State', 'Bank', 'BankState',
           'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist',
           'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr',
           'LowDoc', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross',
           'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv',
           'default', 'Zip5d', 'Zip3d', 'SBA_ratio', 'RealEstate',
           'NAICS_default_rate', 'NAICS_group', 'suffix', 'Loan_age', 'Previous_loan', 'default_times',
           'Expanding', 'Retaining', 'Expanding_ratio', 'Retaining_ratio'
           ]
print nat.shape
nat = nat[use_col]
print nat.shape
# nat[use_col].head().T
# Train & test data in 2003 and 2004, predict in 2005
nat34 = nat[nat.ApprovalFY.isin([2003, 2004])].reset_index(drop = True)
nat5 = nat[nat.ApprovalFY.isin([2005])].reset_index(drop = True)
print nat34.shape, nat5.shape
def extract_train_features(features, drop, categorical):
    """Drop unwanted columns and integer-encode the categorical columns.

    Parameters
    ----------
    features : pd.DataFrame of training rows.
    drop : list of column names to remove before encoding.
    categorical : list of column names to replace with integer code columns.

    Returns
    -------
    (dict_categorical, features) — dict_categorical maps each categorical
    column name to a {category value -> integer code} mapping (reused later
    to encode the test set); features has each categorical column replaced
    by a '<name>_INT' (or '<name[:-1]>INT') code column.
    """
    # print() form works under both Python 2 and 3 for single arguments.
    print('-----> Extract train features <------')
    print('dropping unwanted columns')
    features = features.drop(drop, axis=1)
    print('transforming categorical variables')
    dict_categorical = {}
    for col in categorical:
        cat = pd.Categorical(features[col])
        # Names whose second-to-last character is '_' already carry a suffix
        # separator: replace the final character with 'INT' instead of
        # appending '_INT'. (Dead pre-assignment of new_col removed.)
        if col[-2] == '_':
            new_col = col[:-1] + 'INT'
        else:
            new_col = col + '_INT'
        features.loc[:, new_col] = cat.codes
        dict_categorical[col] = dict((k, v) for v, k in enumerate(cat.categories))
    features = features.drop(categorical, axis=1)
    print('done')
    return dict_categorical, features
def extract_test_features(test, drop, categorical, dict_categorical):
    """Apply the train-time preprocessing to a test frame.

    Parameters
    ----------
    test : pd.DataFrame to transform.
    drop : columns to remove (same list as used for training).
    categorical : columns to encode with the train-time mappings.
    dict_categorical : {column -> {category value -> int code}} produced by
        extract_train_features.

    Returns
    -------
    The transformed frame; categories unseen at train time are encoded as -1.
    """
    print('-----> Extract test features <------')
    print('dropping unwanted columns')
    test = test.drop(drop, axis=1)
    # Typo fix: was 'variabless'; matches the train-side message.
    print('transforming categorical variables')
    for col in categorical:
        # Mirror the train-time column naming convention exactly.
        # (Dead pre-assignment of new_col removed.)
        if col[-2] == '_':
            new_col = col[:-1] + 'INT'
        else:
            new_col = col + '_INT'
        # map() yields NaN for unseen categories; fill with -1 directly
        # instead of a chained inplace fillna (avoids SettingWithCopy issues).
        test[new_col] = test[col].map(dict_categorical[col]).fillna(-1)
    test = test.drop(categorical, axis=1)
    print('done')
    return test
nat34.head()
from sklearn import model_selection
# Stratified 75/25 split of the 2003-2004 loans; stratifying on the default
# flag keeps the same default rate in both partitions.
Train, Test = model_selection.train_test_split(nat34,
                                               test_size = 0.25,
                                               random_state = 1868,
                                               stratify = nat34.default
                                               )
print Train.shape, Test.shape
print Train.default.sum(), Test.default.sum()
# Default rate in each partition (true division via the __future__ import).
print Train.default.sum()/Train.shape[0], Test.default.sum()/Test.shape[0]
Train.head().T
print Train.columns.tolist()
# Preprocessing train set
features = Train
target = Train.default
# Columns excluded from modelling: identifiers, free text, dates, the target,
# and outcome-leaking amounts (charge-off / disbursement figures).
drop = ['LoanNr_ChkDgt', 'Name', 'ApprovalDate', 'ApprovalFY', 'ChgOffDate', 'DisbursementDate',
        'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'SBA_ratio',
        'default', 'FranchiseCode', 'Term', 'NAICS']
# Columns to integer-encode via extract_train_features.
categorical = ['City', 'State', 'Zip5d', 'Zip3d', 'Bank', 'BankState', 'RevLineCr',
               'LowDoc', 'NAICS_group', 'suffix',
               'Expanding_ratio', 'Retaining_ratio'
               ]
dict_categorical, features = extract_train_features(features,
                                                    drop,
                                                    categorical)
print features.shape
print target.sum()
features.head()
# Inner 75/25 split of the training features, used only to pick the number of
# boosting rounds by early stopping on validation AUC.
X_train, X_test, y_train, y_test = model_selection.train_test_split(features,
                                                                    target,
                                                                    test_size = 0.25,
                                                                    random_state=3776,
                                                                    stratify=target
                                                                    )
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
dtest = xgb.DMatrix(X_test.values, y_test.values)
num_rounds = 1100
# num_rounds = 2000
# Slow learning rate with deep trees plus row/column subsampling; AUC is the
# early-stopping metric.
params = {'silent':1,
          'eta':0.01,
          'max_depth':10,
          'subsample': 0.7,
          'colsample_bytree': 0.6,
          'min_child_weight':1,
          'objective':'binary:logistic',
          'eval_metric':'auc',
          'seed':2017,
          'gamma':0.1,
          'nthread':-1}
watchlist = [(dtrain, 'train'),(dtest,'validation')]
bst=xgb.train(params, dtrain, num_rounds, watchlist, early_stopping_rounds = 50, verbose_eval = True);
# Keep the round count that maximised validation AUC for the final refit.
num_rounds = bst.best_iteration
print num_rounds
# Use all the train data to train the model
X_train_matrix = features.values
#SKLEARN
# Re-fit on ALL training rows with the sklearn wrapper, reusing the tuned
# hyperparameters and the round count found by early stopping above.
clf_xgb = xgb.XGBClassifier(silent = params['silent'],
                            learning_rate = params['eta'],
                            max_depth = params['max_depth'],
                            subsample = params['subsample'],
                            colsample_bytree = params['colsample_bytree'],
                            min_child_weight = params['min_child_weight'],
                            objective = params['objective'],
                            n_estimators = num_rounds,
                            seed = params['seed'],
                            nthread = params['nthread'],
                            gamma = params['gamma']
                            )
clf_xgb.fit(X_train_matrix,
            target,
            eval_metric ='auc')
# Preprocessing test set
test_X = Test.copy()
# Preprocessing
# target = plu_n_test.Target
# Same drop/categorical lists as the train set; unseen categories map to -1.
drop = ['LoanNr_ChkDgt', 'Name', 'ApprovalDate', 'ApprovalFY', 'ChgOffDate', 'DisbursementDate',
        'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'SBA_ratio',
        'default', 'FranchiseCode', 'Term', 'NAICS']
categorical = ['City', 'State', 'Zip5d', 'Zip3d', 'Bank', 'BankState', 'RevLineCr',
               'LowDoc', 'NAICS_group', 'suffix',
               'Expanding_ratio', 'Retaining_ratio'
               ]
test_bas = extract_test_features(test_X,
                                 drop,
                                 categorical,
                                 dict_categorical)
# Prediction
# Sanity check: every train feature must also exist in the test features.
for col in features.columns:
    if col not in test_bas.columns:
        print 'MISSING COLUMN: ',col
# Align test columns to the train column order before building the matrix.
test_bas= test_bas[features.columns]
X_test_matrix = test_bas.values
print X_train_matrix.shape, X_test_matrix.shape
y_pred_xgb = clf_xgb.predict_proba(X_test_matrix)
temp = pd.DataFrame(y_pred_xgb)
# Evaluation frame: ids, outcome fields, and the predicted default probability.
kernix_check = test_X[['LoanNr_ChkDgt', 'Name', 'ApprovalFY', 'State', 'default', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'SBA_ratio']]
# Column 1 of predict_proba is the positive (default) class probability.
# NOTE(review): kernix_check is a slice of test_X — this .loc assignment may
# emit a SettingWithCopyWarning; a .copy() would silence it. Confirm intent.
kernix_check.loc[:, 'prob'] = y_pred_xgb[:,1]
kernix_check.head()
# Plotting setup (re-imports are harmless notebook habit).
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn
import matplotlib.pyplot as plt
seaborn.set_style('darkgrid')
def __to_percent1(y, position):
    """Matplotlib tick formatter: render a 0-1 fraction as a percentage
    with one decimal place (`position` is required by FuncFormatter but unused)."""
    return "{:.1f}%".format(y * 100.0)
def plot_roc(target, predicted_proba, title, save_png=''):
    """Draw a single ROC curve for `predicted_proba` against the binary `target`.

    The curve is labelled with its AUC; axes are rendered as percentages.
    Saves the figure to `save_png` when a path is given, otherwise shows it.
    """
    import matplotlib.pyplot as plt
    import matplotlib.ticker as mtick
    from sklearn.metrics import roc_curve, roc_auc_score

    false_pos, true_pos, _ = roc_curve(target, predicted_proba)
    auc_value = roc_auc_score(target, predicted_proba)

    plt.figure()
    plt.plot(false_pos, true_pos, '-', alpha=.8, color='red', lw=1.5,
             label=title + ' (auc = %0.3f)' % auc_value)
    # Diagonal reference line for a no-skill classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Chance')
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.01])
    # Format both axes as percentages.
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(mtick.FuncFormatter(__to_percent1))
    plt.xlabel('Non default cases', fontsize=15)
    plt.ylabel('Default cases', fontsize=15)
    plt.title("\nROC curve - {}\n".format(title), fontsize=18)
    plt.legend(loc="lower right", fontsize=15)
    if save_png != '':
        plt.savefig(save_png, format="png")
    else:
        plt.show()
# IPython magic (Jupyter only): inline pylab plotting.
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 10)
plot_roc(kernix_check.default, kernix_check.prob, 'Test')
# Feature Importance
#BOOSTER
# Re-train a booster with feature_names set so importance plots and eli5
# output show real column names instead of f0, f1, ...
dtrain_ex=xgb.DMatrix(features.values,
                      label=target.values,
                      feature_names=features.columns)
bst_ex=xgb.train(params,
                 dtrain_ex,
                 num_boost_round=bst.best_iteration,
                 verbose_eval=False
                 )
bst_ex.feature_names[:10]
def plot_features_importance(bst):
    """Bar-plot the 30 features with the highest booster f-score."""
    scores = bst.get_fscore()
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    top30 = dict(ranked[:30])
    fig, ax = plt.subplots(1, 1, figsize=(20, 15))
    xgb.plot_importance(top30, ax=ax)
def print_features_importance(bst):
    """Return a one-column DataFrame ('Score') of the top-30 features by
    booster f-score, sorted descending."""
    scores = bst.get_fscore()
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    top30 = dict(ranked[:30])
    table = pd.DataFrame([top30]).T
    table = table.rename(columns = {0: 'Score'})
    return table.sort_values('Score', ascending=False)
plot_features_importance(bst_ex)
feat_max = print_features_importance(bst_ex)
# NOTE(review): 'Accumlated' is a typo but it is a runtime column label —
# left unchanged here; fix at source if the label matters downstream.
feat_max.rename(columns = {'Score': 'Accumlated score'}).head(15)
# Tuning grades
# Distribution of predicted default probabilities (raw and log scale).
kernix_check.prob.hist(bins = 100)
np.log(kernix_check.prob).hist(bins = 100)
def tuning_grades(num_grades, prob):
    """Compute bin edges that split `prob` into `num_grades` equal-population bins.

    Returns num_grades + 1 edges: equally spaced percentiles of `prob` with the
    outer edges pinned to 0 and 1, each rounded to 3 decimal places.
    """
    quantiles = np.linspace(0, 100, num_grades + 1)
    edges = [np.percentile(prob, q) for q in quantiles]
    edges[0], edges[-1] = 0, 1
    return [round(edge, 3) for edge in edges]
# Five equal-population grades from the predicted probabilities.
prob_th = tuning_grades(5, kernix_check.prob)
prob_th
grades = [str(g) for g in range(1,6)]
# NOTE(review): pd.cut bins are left-open by default, so a prob exactly equal
# to the lowest edge (0) would become NaN and the astype('int') below would
# fail — presumably all probabilities are strictly positive; confirm.
kernix_check.loc[:, 'Grade'] = pd.cut(kernix_check.prob, bins=prob_th, labels=grades)
kernix_check.loc[:, 'Grade'] = kernix_check['Grade'].astype('int')
kernix_check.Grade.value_counts().sort_index()
def plot_grade_roc(target, grade, predicted_proba, title, save_png=''):
    """Plot ROC curves for the raw probabilities and their discretised grades.

    target : binary outcome series.
    grade : integer grade per row (the discretised score).
    predicted_proba : raw model probabilities (grey reference curve).
    title : chart title / legend label.
    save_png : optional path; when given the figure is saved as PNG.
    """
    import matplotlib.pyplot as plt
    import matplotlib.ticker as mtick
    from sklearn.metrics import roc_curve, roc_auc_score
    fpr, tpr, _ = roc_curve(target, predicted_proba)
    fpr_plot, tpr_plot, _ = roc_curve(target, grade)
    raw_auc_plot = roc_auc_score(target, predicted_proba)
    new_grade_auc_plot = roc_auc_score(target, grade)
    plt.figure()
    plt.plot(fpr, tpr, '-', color='grey', alpha=.3, label="Raw PD (auc = %0.3f)" % raw_auc_plot)
    plt.plot(fpr_plot, tpr_plot, 'o-', color='red', alpha=.8, lw=1.5, label= title + ' (auc = %0.3f)' % new_grade_auc_plot)
    plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Chance')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.gca().xaxis.set_major_formatter(mtick.FuncFormatter(__to_percent1))
    plt.gca().yaxis.set_major_formatter(mtick.FuncFormatter(__to_percent1))
    plt.xlabel('Non wasted policies', fontsize=15)
    plt.ylabel('Wasted policies', fontsize=15)
    plt.title("\nROC curve - {}\n".format(title), fontsize=18)
    plt.legend(loc="lower right", fontsize=15)
    # Annotate the grade cut points on the grade ROC (labels 5..1).
    # CLEANUP: removed the unused bbox_props / bbox_props3 dicts and the
    # redundant `if i >= 1 and i <= 6` guard inside range(0, 6) — iterating
    # range(1, 6) is equivalent.
    bbox_props2 = dict(boxstyle="circle,pad=0.3", fc="white", ec="red", lw=1)
    for i in range(1, 6):
        try:
            plt.text(fpr_plot[i] - .01, tpr_plot[i] + .05, "%s" % (6 - i), color="red", ha="center", va="center", size=15, bbox=bbox_props2)
        except IndexError:
            # Fewer ROC points than grades: skip the missing annotations
            # (narrowed from a bare except, which also hid real errors).
            pass
    if save_png != '':
        plt.savefig(save_png, format="png")
    plt.show()
plot_grade_roc(kernix_check.default, kernix_check.Grade, kernix_check.prob, 'Test')
# Observed default rate per grade.
kernix_check.groupby('Grade').default.sum()/kernix_check.Grade.value_counts()
kernix_check.head()
# Total charged-off principal per grade.
kernix_check.groupby('Grade').ChgOffPrinGr.sum()
For banks, using machine learning to classify loans could help avoid some defaults, especially among grade-5 companies.
def bank_loss(chgoffamount, sba_app):
    """Bank's loss on one loan: charged-off principal net of the SBA guarantee.

    Returns 0 when nothing was charged off, or when the guaranteed amount
    covers the whole charge-off; the result is never negative.
    """
    if chgoffamount == 0:
        return 0
    uncovered = chgoffamount - sba_app
    return uncovered if uncovered > 0 else 0
# Bank loss per loan: charge-off net of the SBA-guaranteed amount, then per grade.
kernix_check.loc[:, 'Bank_loss'] = kernix_check.apply(lambda x: bank_loss(x['ChgOffPrinGr'], x['SBA_Appv']), axis = 1)
kernix_check.groupby('Grade').Bank_loss.sum()
Given that part of each default is covered by the SBA, the actual loss for banks in each grade is much lower!
(United States Small Business Administration)
The SBA is a government agency whose objective is to help small businesses get through difficult periods.
Let's change the role of the SBA: suppose it acts as a credit-insurance provider and charges a premium of 10% of the amount it insures.
# Average guaranteed fraction (SBA_Appv / GrAppv) per grade.
kernix_check.groupby('Grade').SBA_ratio.mean()
It seems the SBA is able to pick good companies to guarantee loans for, but fails to avoid claims.
# Let's say the premium is 10% of the insure amount
# (Repeated from above) average guaranteed fraction per grade.
kernix_check.groupby('Grade').SBA_ratio.mean()
def sba_claim(chgoffamount, sba_app):
    """SBA's claim on one loan: the guaranteed amount, capped at the
    charged-off principal; 0 when nothing was charged off."""
    if chgoffamount == 0:
        return 0
    return sba_app if sba_app <= chgoffamount else chgoffamount
# SBA claim per loan: guaranteed amount capped by the charged-off principal.
kernix_check.loc[:, 'SBA_claim'] = kernix_check.apply(lambda x: sba_claim(x['ChgOffPrinGr'], x['SBA_Appv']), axis = 1)
# claim
kernix_check.groupby('Grade').SBA_claim.sum()
# Premium
print 'Premium in each grade '
# Premium assumed to be 10% of the guaranteed amount.
kernix_check.groupby('Grade').SBA_Appv.sum()*0.1
# Gain and loss in each grade
kernix_check.groupby('Grade').SBA_Appv.sum()*0.1 - kernix_check.groupby('Grade').SBA_claim.sum()
# Total premium:
print 'Total premium: ', kernix_check.SBA_Appv.sum()*0.1
print 'Total claim : ', kernix_check.SBA_claim.sum()
print 'Net profit: ', kernix_check.SBA_Appv.sum()*0.1 - kernix_check.SBA_claim.sum()
# Test.SBA_ratio.value_counts().sort_index()
def sba_ratio_ml(grade):
    """Model-driven guarantee schedule: grade 1 (safest) is fully covered,
    stepping down by 20 points per grade to 20% for grade 5 (riskiest)."""
    schedule = {1: 1, 2: 0.8, 3: 0.6, 4: 0.4, 5: 0.2}
    return schedule[grade]
# Re-price the guarantee using the model grades: safer grades keep more cover.
kernix_check.loc[:, 'SBA_ratio_ml'] = kernix_check.Grade.apply(sba_ratio_ml)
kernix_check.loc[:, 'SBA_Appv_ml'] = kernix_check.GrAppv * kernix_check.SBA_ratio_ml
# claim
kernix_check.loc[:, 'SBA_claim_ml'] = kernix_check.apply(lambda x: sba_claim(x['ChgOffPrinGr'], x['SBA_Appv_ml']), axis = 1)
kernix_check.groupby('Grade').SBA_claim_ml.sum()
# Premium
print 'Premium in each grade '
kernix_check.groupby('Grade').SBA_Appv_ml.sum()*0.1
# Gain and loss in each grade
kernix_check.groupby('Grade').SBA_Appv_ml.sum()*0.1 - kernix_check.groupby('Grade').SBA_claim_ml.sum()
# Total premium:
print 'Total premium: ', kernix_check.SBA_Appv_ml.sum()*0.1
print 'Total claim : ', kernix_check.SBA_claim_ml.sum()
print 'Net profit: ', kernix_check.SBA_Appv_ml.sum()*0.1 - kernix_check.SBA_claim_ml.sum()
# Interactive bar charts comparing premiums and claims with / without the
# ML-based guarantee schedule.
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode()
cf.go_offline()
(kernix_check.groupby('Grade')[['SBA_Appv', 'SBA_Appv_ml']].sum()*0.1).iplot(kind = 'bar',
    title = 'Premium change with machine learning', yTitle = 'USD', xTitle = 'Grades')
(kernix_check.groupby('Grade')[['SBA_claim', 'SBA_claim_ml']].sum()).iplot(kind = 'bar',
    title = 'Claim change with machine learning', yTitle = 'USD', xTitle = 'Grades')
# eli5 explanations of the booster: global weights under three importance
# definitions, then a single-row prediction explanation.
import eli5
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="weight")
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="gain")
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="cover")
# BUG FIX: `iloc` was an undefined name (NameError at runtime); explain the
# first encoded test row instead.
eli5.show_prediction(bst_ex, test_bas.iloc[0], show_feature_values=True)
# kernix_check.head()
Test[Test.default == 1].head()
Test[Test.LoanNr_ChkDgt == 7461474001].iloc[[0]].index
def explain_grade(loan_number, show_data = True, chart = True, model = bst_ex,
                  testset = Test, feeding_data = test_bas, result = kernix_check):
    """Sum eli5 per-feature contributions into coarse business groups for one loan.

    loan_number: LoanNr_ChkDgt of the loan to explain.
    show_data, chart: currently unused (kept for interface stability).
    model / testset / feeding_data / result: default to the module-level
    booster, raw test frame, encoded feature frame and prediction frame
    (defaults are bound at definition time).
    Returns {group name -> summed eli5 weight} for the loan's prediction.
    """
    testset = testset[testset.LoanNr_ChkDgt == loan_number]
    feeding_data = feeding_data.loc[testset.index]
    result = result[result.LoanNr_ChkDgt == loan_number]
    # display(testset)
    # display(feeding_data)
    # display(result)
    # display(eli5.show_prediction(model, feeding_data.iloc[0], show_feature_values=True))
    df = eli5.explain_prediction_df(model, feeding_data.iloc[0])
    # Map encoded feature names onto interpretable business-level groups.
    feature_groups = {'Location': ['City_INT', 'UrbanRural', 'Zip3d_INT', 'Zip5d_INT', 'State_INT'],
                      'Sector': ['NAICS_group_INT', 'NAICS_default_rate'],
                      'Business': ['RealEstate', 'Franchise', 'NewExist'],
                      'Employees': ['CreateJob', 'RetainedJob', 'NoEmp'],
                      'Loan': ['BankState_INT', 'Bank_INT', 'LowDoc_INT', 'RevLineCr_INT']
                      }
    group_contribution = {}
    for k in feature_groups.keys():
        group_contribution[k] = df[df.feature.isin(feature_groups[k])].weight.sum()
    return group_contribution
x = explain_grade(6920774003)
x
feature_groups = {'Location': ['City_INT', 'UrbanRural', 'Zip3d_INT', 'Zip5d_INT', 'State_INT'],
                  'Sector': ['NAICS_group_INT', 'NAICS_default_rate'],
                  'Business': ['RealEstate', 'Franchise', 'NewExist'],
                  'Employees': ['CreateJob', 'RetainedJob', 'NoEmp'],
                  'Loan': ['BankState_INT', 'Bank_INT', 'LowDoc_INT', 'RevLineCr_INT']
                  }
# NOTE(review): explain_grade returns a plain dict, so the DataFrame-style
# filtering below (x.feature / .isin / .weight) will raise AttributeError —
# this looks like leftover scratch from when x was the eli5 DataFrame; confirm.
x[x.feature.isin(feature_groups['Location'])].weight.sum()
test_bas.head()
kernix_check.head()
Test[Test.default == 1].sample(10).T
# IPython shell magic (Jupyter only): inspect the local data directory.
!ls ../large_data_files/ASA_loan_data/